List_in

Total prevalence type & Total production quantity

Total prevalence type & Total production quantity

Duration

Row

Movie Duration vs. Decade

TV Show Duration vs. Decade

Row

Movie Duration vs. Genre

TV Show Duration vs. Genre

Rating

Column

The rating category under different type

Rating category over each year

The distribution of rating

Country

Column

Country vs Types

Keywords

cloud1

cloud2

---
title: "Project"
author: ""
date: "2023-12-11"
output:   
  flexdashboard::flex_dashboard:
    orientation: rows
    social: ["wechat"]
    theme: united
    source_code: embed
---

```{r setup, include=FALSE}
library(ggplot2)
library(plotly)
library(dplyr)
library(stringr)
library(tidyr)
library(flexdashboard)
library(knitr)
library(tidyverse)
library(purrr) # for functional programming
library(wordcloud2) # for creating word cloud
```

```{r}
# Load the Netflix catalogue that every chunk below works from.
df <- read.csv(file = "data/netflix_titles.csv")
```


# List_in {.storyboard}
### Total prevalence type & Total production quantity 
```{r}
# Build the genre-level table in a single pipeline:
#   1. keep the last four characters of release_year as a numeric year,
#   2. split the comma-separated `listed_in` field into one row per genre,
#   3. order the rows by year.
data <- df %>%
  mutate(release_year = as.numeric(str_sub(release_year, -4))) %>%
  separate_rows(listed_in, sep = ", ") %>%
  arrange(release_year)
```

```{r}
# Top ten genres by number of titles within each release year.
#
# `count()` tallies titles per (release_year, listed_in) and returns an
# ungrouped table, so no regrouping message is emitted. `slice_max()` replaces
# the superseded `top_n(10)`, which was called without `wt` and therefore
# picked its ranking column implicitly. Ties at the cut-off are kept
# (with_ties = TRUE is the default), as they were with `top_n()`.
# The result stays grouped by release_year.
top10_types <- data %>%
  count(release_year, listed_in, name = "type_count") %>%
  arrange(release_year, desc(type_count)) %>%
  group_by(release_year) %>%
  slice_max(type_count, n = 10)

# top10_types
```

```{r}
# For each genre, count how many yearly top-ten lists it appears in,
# most frequent genre first.
grouped_table <- top10_types %>%
  ungroup() %>%
  count(listed_in, name = "count", sort = TRUE)
# grouped_table
```


```{r}
# Give the two summary columns the names used in the plot below.
names(grouped_table) <- c("Type_name", "epidemic_years")

# Bar chart of genres, ordered from most to fewest top-ten years; the bar
# colour follows the same value on a dark-to-red gradient.
p1 <- grouped_table %>%
  ggplot(aes(
    x = reorder(Type_name, -epidemic_years),
    y = epidemic_years,
    fill = epidemic_years
  )) +
  geom_col() +
  scale_fill_gradient(low = "#221f1f", high = "#b20710") +
  labs(
    title = "Total prevalence type",
    x = "type",
    y = "The number of epidemic years of a type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplotly(p1)
```

```{r}
# Total number of titles per genre, summed over the yearly top-ten lists and
# sorted from the largest total to the smallest.
total_table <- top10_types %>%
  group_by(listed_in) %>%
  summarise(total_count = sum(type_count), .groups = "drop") %>%
  arrange(desc(total_count))

# total_table
```

### Total prevalence type & Total production quantity 
```{r}
# Give the two summary columns the names used in the plot below.
names(total_table) <- c("Type_name", "Production_volume")

# Bar chart of production volume per genre, largest first; the bar colour
# follows the same value on a dark-to-red gradient.
p2 <- total_table %>%
  ggplot(aes(
    x = reorder(Type_name, -Production_volume),
    y = Production_volume,
    fill = Production_volume
  )) +
  geom_col() +
  scale_fill_gradient(low = "#221f1f", high = "#b20710") +
  labs(
    title = "Total production quantity",
    x = "type",
    y = "Production volume"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplotly(p2)
```


Duration
============================

Row
-------------------

### Movie Duration  vs. Decade

```{r}
# Movies only; keep the first run of digits in `duration` as a number
# (the y axis below labels it in minutes).
data <- subset(df, type == "Movie")
data$duration <- as.numeric(str_extract(data$duration, "\\d+"))

# Bin release years into decades labelled by their first year.
# right = FALSE makes the bins [1920, 1930), [1930, 1940), ... so that a year
# such as 2020 is labelled "2020". With cut()'s default right-closed bins it
# fell into (2010, 2020] and was labelled "2010".
data$decade <- cut(
  data$release_year,
  breaks = seq(1920, 2040, by = 10),
  labels = seq(1920, 2030, by = 10),
  right = FALSE
)

# Notched box plot per decade, restricted to movies of at most 200 minutes.
ggplot(data %>% filter(duration <= 200), aes(x = decade, y = duration)) +
  geom_boxplot(fill = "#b20710", color = "#221f1f", notch = TRUE) +
  labs(
    title = "Duration Distribution of Movies by Decade",
    x = "Decade",
    y = "Duration"
  ) +
  theme_minimal() +
  scale_y_continuous(breaks = seq(0, 200, 50), labels = paste0(seq(0, 200, 50), " minutes"))
```

### TV Show Duration  vs. Decade

```{r}
# TV shows only; keep the first run of digits in `duration` as a number
# (the y axis below labels it in seasons).
data <- subset(df, type == "TV Show")
data$duration <- as.numeric(str_extract(data$duration, "\\d+"))

# Bin release years into decades labelled by their first year.
# right = FALSE makes the bins [1920, 1930), [1930, 1940), ... so that a year
# such as 2020 is labelled "2020". With cut()'s default right-closed bins it
# fell into (2010, 2020] and was labelled "2010".
data$decade <- cut(
  data$release_year,
  breaks = seq(1920, 2040, by = 10),
  labels = seq(1920, 2030, by = 10),
  right = FALSE
)

# Notched box plot of the number of seasons per decade.
ggplot(data, aes(x = decade, y = duration)) +
  geom_boxplot(fill = "#b20710", color = "#221f1f", notch = TRUE) +
  labs(
    title = "Duration Distribution of TV Show by Decade",
    x = "Decade",
    y = "Duration"
  ) +
  theme_minimal() +
  scale_y_continuous(breaks = seq(0, 20, 5), labels = paste0(seq(0, 20, 5), " seasons"))
```

Row
-------------------

### Movie Duration  vs. Genre

```{r}
# Mean movie length per genre: one row per (title, genre), numeric duration
# taken from the first run of digits, then averaged within each genre.
data <- df %>%
  filter(type == "Movie") %>%
  separate_rows(listed_in, sep = ", ") %>%
  mutate(duration = as.numeric(str_extract(duration, "\\d+"))) %>%
  group_by(listed_in) %>%
  summarise(avg_duration = mean(duration, na.rm = TRUE))

# Tick positions shared by the axis breaks and their labels.
movie_ticks <- seq(0, 150, 20)

# Horizontal bars ordered by average duration; colour follows the same value.
data %>%
  ggplot(aes(
    x = reorder(listed_in, avg_duration),
    y = avg_duration,
    fill = avg_duration
  )) +
  geom_col() +
  scale_fill_gradient(low = "#221f1f", high = "#b20710") +
  scale_y_continuous(breaks = movie_ticks, labels = movie_ticks) +
  coord_flip() +
  labs(
    title = "Average Duration of Movies by Genre",
    x = "Genre",
    y = "Average Duration (minutes)",
    fill = "Average Duration (minutes)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

### TV Show Duration  vs. Genre

```{r}
# Mean TV-show length per genre: numeric duration taken from the first run of
# digits, one row per (title, genre), then averaged within each genre.
data <- df %>%
  filter(type == "TV Show") %>%
  mutate(duration = as.numeric(str_extract(duration, "\\d+"))) %>%
  separate_rows(listed_in, sep = ", ") %>%
  group_by(listed_in) %>%
  summarise(avg_duration = mean(duration, na.rm = TRUE))

# Draw the chart: horizontal bars ordered by average duration.
ggplot(data, aes(x = reorder(listed_in, avg_duration), y = avg_duration, fill = avg_duration)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Average Duration of TV Shows by Genre",
    x = "Genre",
    y = "Average Duration (seasons)",
    fill = "Average Duration (seasons)"
  ) +
  theme_minimal() +
  scale_y_continuous(breaks = seq(0, 7, 1), labels = seq(0, 7, 1)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip() +
  # Same direction as the other gradients in this dashboard: dark for low
  # values, red for high values (it was inverted here).
  scale_fill_gradient(low = "#221f1f", high = "#b20710")
```

Rating
=====================================
## Column {.tabset data-width="650"}
```{r include=FALSE}
# Keep only the titles whose rating is not the empty string.
has_rating <- df$rating != ""
mydata <- df[has_rating, ]

# Distinct rating codes (printed for inspection; this chunk is not included
# in the rendered output).
l <- unique(mydata$rating)
l

# Number of titles per rating code.
plotdata <- summarize(group_by(mydata, rating), n = n())
plotdata
```
### The rating category under different type

```{r}
# Number of titles per (type, rating). `.groups = "drop"` returns an ungrouped
# table and avoids the regrouping message.
plotdata <- mydata %>%
  group_by(type, rating) %>%
  summarize(n = n(), .groups = "drop")

# Audience buckets in display order. "Other" must be listed here: case_when()
# below assigns it to unexpected ratings, and a value missing from the factor
# levels would be turned into NA and shown as a "category: NA" facet.
category_levels <- c("Little Kids", "Older Kids", "Teenagers", "Adults", "Other")

category_type_data <- plotdata %>%
  mutate(category = case_when(
    rating %in% c("G", "TV-G", "TV-Y") ~ "Little Kids",
    rating %in% c("PG", "TV-PG", "TV-Y7") ~ "Older Kids",
    rating %in% c("TV-Y7-FV", "PG-13", "TV-14") ~ "Teenagers",
    rating %in% c("NC-17", "NR", "R", "TV-MA", "UR") ~ "Adults",
    TRUE ~ "Other" # Default category for any unexpected ratings
  ))

category_type_data$category <- factor(category_type_data$category,
  levels = category_levels,
  ordered = TRUE
)

# Stacked bars per rating, split by type, with one facet per audience bucket.
p <- category_type_data %>%
  ggplot(aes(rating, n, fill = type, label = n)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(position = position_stack(vjust = 0.5), color = "white", size = 3) +
  labs(x = "Rating", y = "Count", title = "Distribution of Ratings with Type") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    # The border was set to blank and then overridden further down; it is now
    # set once with the value that took effect.
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    strip.background = element_blank(),
    strip.text = element_text(face = "bold"),
    strip.text.x = element_text(margin = margin(b = 10)),
    legend.position = "right"
  ) +
  facet_grid(. ~ category, scales = "free_x", labeller = label_both) +
  scale_fill_manual(values = c("#b20710", "#221f1f"))

p
```

### Rating category over each year
```{r}
# Number of titles per (rating, release year).
plotdata <- mydata %>%
  group_by(rating, release_year) %>%
  summarize(n = n())

# Releases from 2000 onwards, each rating mapped to an audience bucket.
p3_data <- plotdata %>%
  filter(release_year >= 2000) %>%
  mutate(category = case_when(
    rating %in% c("G", "TV-G", "TV-Y") ~ "Little Kids",
    rating %in% c("PG", "TV-PG", "TV-Y7") ~ "Older Kids",
    rating %in% c("TV-Y7-FV", "PG-13", "TV-14") ~ "Teenagers",
    rating %in% c("NC-17", "NR", "R", "TV-MA", "UR") ~ "Adults",
    TRUE ~ "Other" # Fallback bucket for any unexpected ratings
  ))

# Bars normalised to 100% per year, coloured by audience bucket.
p3 <- ggplot(p3_data, aes(x = release_year, y = n, fill = category)) +
  geom_col(position = "fill") +
  scale_fill_manual(
    values = c("#221f1f", "#5e4d4d", "#b20710", "#ec8b90", "#ec8b06")
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 100)) +
  labs(
    x = "Release Year",
    y = "Count (Percentage)",
    title = "Distribution of Ratings Over Years"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p3_plotly <- ggplotly(p3)
p3_plotly
```



### The distribution of rating
```{r}
# The distribution of rating: number of titles per rating code.
plotdata <- mydata %>%
  group_by(rating) %>%
  summarize(n = n())

# One manual fill colour per rating code (17 entries).
rating_palette <- c(
  "#221f1f", "#301D1C", "#3E1B1B", "#4C1919", "#5A1717",
  "#681515", "#761313", "#841111", "#921010", "#A00E0E",
  "#AE0C0C", "#BC0A0A", "#CA0808", "#B20710", "#b20710",
  "#ec8b06", "#ec8b90"
)

# Bar chart ordered from the most to the least frequent rating, with the
# x-axis labels rotated.
p1 <- plotdata %>%
  ggplot(aes(x = reorder(rating, -n), y = n, fill = rating)) +
  geom_col() +
  scale_fill_manual(values = rating_palette) +
  labs(
    x = "Rating",
    y = "Count",
    title = "The distribution of rating"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p1_plotly <- ggplotly(p1)
p1_plotly
```
 
Country
=====================================
## Column {.tabset data-width="650"}

### Country vs Types
```{r}
# Pre-process: re-read the catalogue so that empty strings become NA.
df <- read.csv("data/netflix_titles.csv", na.strings = c("", "NA"))

# Number of titles per (country, year added, type). Titles listing several
# countries contribute one row to each of them.
grouped <- df %>%
  filter(!is.na(country), !is.na(date_added)) %>%
  # year() is namespaced because this file never attaches lubridate itself.
  mutate(year_added = lubridate::year(parse_date(date_added, "%B %d, %Y"))) %>%
  separate_rows(country, sep = ",") %>%
  mutate(country = trimws(country)) %>%
  # Splitting can leave empty entries (e.g. a leading comma); drop them.
  filter(!is.na(country) & country != "") %>%
  group_by(country, year_added, type) %>%
  summarise(cnt = n(), .groups = "drop")

# Collapse the years: number of titles per (country, type).
by_country_type <- grouped %>%
  group_by(country, type) %>%
  summarise(cnt = sum(cnt), .groups = "drop")

# Share of each type within a country, plus the country's total.
# mutate() replaces a summarise() that returned several rows per group, which
# is deprecated in dplyr 1.1.0.
# NOTE(review): top_n(20, wt = total) ranks rows, not countries. Each country
# has one row per type, so fewer than 20 countries are kept. Confirm whether
# "top 20 countries" was intended.
type_prop <- by_country_type %>%
  group_by(country) %>%
  mutate(
    total = sum(cnt),
    prop = round(cnt / total * 100, 1)
  ) %>%
  ungroup() %>%
  select(country, total, prop, type) %>%
  as.data.frame() %>%
  top_n(20, wt = total)

# Country order for the axis: sorted by descending movie share, with TV-show
# rows keyed by their negated share. pull() + unique() replaces the deprecated
# purrr::flatten() and yields a plain character vector of levels.
custom_order <- type_prop %>%
  arrange(desc(ifelse(type == "Movie", prop, -prop))) %>%
  pull(country) %>%
  unique()

ggplot(type_prop, aes(y = factor(country, levels = custom_order), x = prop, fill = type)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = scales::percent(prop / 100)),
    position = position_stack(vjust = 0.5),
    color = "white",
    size = 3
  ) +
  labs(
    title = "Proportions of Movie and TV Show by Country",
    y = "Country",
    x = "Proportion (%)",
    fill = "Type"
  ) +
  scale_x_continuous(labels = scales::percent_format(scale = 1)) +
  # Zoom with the coordinate system instead of scale limits: scale limits
  # discard any stacked bar whose rounded shares add up to just over 100.
  coord_cartesian(xlim = c(0, 100)) +
  scale_fill_manual(values = c("#221f1f", "#b20710")) +
  theme_minimal()
```

# Keywords {.storyboard}

```{r}
library(tidyverse)
library(purrr) # for functional programming
library(wordcloud2) # for creating word cloud
# Reload the catalogue, reading empty strings and "NA" as missing values.
df <- read.csv(
  file = "data/netflix_titles.csv",
  na.strings = c("", "NA")
)
# Ten colours interpolated between the three anchor colours; used to colour
# the word clouds.
netflix_ramp <- colorRampPalette(colors = c("#221f1f", "#b20710", "#e50914"))
netflix_color <- netflix_ramp(10)
```



```{r}
# Lower-case words that are dropped from the word clouds.
COMMON_WORDS <- c(
  "a", "in", "at", "be", "of", "the", "an", "to", "on", "he", "she", "and",
  "his", "with", "her", "for", "their", "when", "this", "from", "as", "is",
  "by", "after", "that", "who", "but", "into", "up", "they", "him", "out",
  "must", "are", "about", "it", "its", "while", "one", "them", "where", "has",
  "more", "over", "have", "off", "two", "s"
)

# Split sentences into cleaned, lower-case words.
#
# Args:
#   sentences: character vector of free text; NA entries are ignored.
# Returns:
#   A character vector with one element per remaining word, in input order.
#   Every character that is not an ASCII letter is stripped, and empty
#   strings and COMMON_WORDS are removed.
#
# The work is done with vectorised calls instead of per-element map() and the
# deprecated purrr::flatten_chr(). NA input no longer aborts the filtering
# step.
extract_words <- function(sentences) {
  tokens <- unlist(strsplit(sentences, "\\s+"), use.names = FALSE)
  tokens <- tolower(gsub("[^a-zA-Z\\s]", "", tokens, perl = TRUE))
  keep <- !is.na(tokens) & nzchar(tokens) & !(tokens %in% COMMON_WORDS)
  tokens[keep]
}

# Draw a word cloud of the most frequent description words.
#
# Args:
#   data: data frame with a `description` column.
# Returns:
#   The wordcloud2 widget built from at most the 500 most frequent words.
word_cloud <- function(data) {
  words <- extract_words(data$description)
  occur <- sort(table(words), decreasing = TRUE)
  # Cap the selection at the number of distinct words: indexing with 1:500
  # pads the table with NA entries whenever fewer than 500 words exist.
  n_keep <- min(500L, length(occur))
  wordcloud2(occur[seq_len(n_keep)], color = netflix_color)
}
```

### cloud1
```{r}
# Word cloud for titles released from 2000 up to, but not including, 2010.
df %>%
  filter(release_year >= 2000, release_year < 2010) %>%
  word_cloud()
```

### cloud2
```{r}
# Word cloud for titles released from 2010 up to, but not including, 2020.
df %>%
  filter(release_year >= 2010, release_year < 2020) %>%
  word_cloud()
```